Import libraries

library(dada2)
packageVersion("dada2") # check dada2 version
## [1] '1.21.0'
library(Biostrings)
library(ShortRead)
library(seqTools) # per base sequence content
library(phyloseq)
library(ggplot2)
library(data.table)
library(plyr)
library(dplyr)
library(qckitfastq) # per base sequence content
library(stringr)

QUALITY CHECK

First, we import the fastq files containing the raw reads. The samples were downloaded from the SRA database with the accession number PRJEB37924. Only sigmoid biopsy samples that were analyzed by 16S rRNA sequencing were downloaded (see 00_Metadata-Mars).

# Save the path to the directory containing the fastq zipped files
path <- "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/Mars-2020"
# list.files(path) # check we are in the right directory

# fastq filenames have format: SAMPLENAME_1.fastq.gz / SAMPLENAME_2.fastq.gz
# Build the full path to every raw forward/reverse fastq file
raw_dir <- file.path(path, "original")
fnFs <- sort(list.files(raw_dir, pattern="_1.fastq.gz", full.names = TRUE)) # FWD reads
fnRs <- sort(list.files(raw_dir, pattern="_2.fastq.gz", full.names = TRUE)) # REV reads
show(fnFs[1:5])
show(fnRs[1:5])

# Extract sample names: everything before the first "_" in the file name
sample.names <- vapply(strsplit(basename(fnFs), "_"), `[`, character(1), 1)
show(sample.names[1:5]) # sample IDs only (no path, no suffix)

# Look at quality of the first few files (increase to 1:length(fnFs) for all)
for (i in 1:3){  # 1:length(fnFs)
  show(plotQualityProfile(fnFs[i]))
  show(plotQualityProfile(fnRs[i]))
}

# Look at nb of reads per sample.
# NOTE: raw_stats is used later in the script (read-count report and the final
# saveRDS() call), so it must actually be computed here — leaving this block
# commented out caused "object 'raw_stats' not found" downstream.
raw_stats <- data.frame('sample' = sample.names,
                        'reads' = seqTools::fastqq(fnFs)@nReads)
min(raw_stats$reads)
max(raw_stats$reads)
mean(raw_stats$reads)

We will have a quick peek at the per-base sequence content of the reads in some samples, to make sure there is no anomaly (i.e. all reads having the same sequence).

# Look at per base sequence content (forward read)
# NOTE: sample index 10 is hard-coded here as an arbitrary representative sample
fseqF1 <- seqTools::fastqq(fnFs[10])
## [fastqq] File ( 1/1) '/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/Mars-2020/original/ERR5169592_1.fastq.gz'    done.
rcF1 <- read_content(fseqF1)
plot_read_content(rcF1) + labs(title = "Per base sequence content - Forward read")

# Same check for the matching reverse read file
fseqR1 <- seqTools::fastqq(fnRs[10])
## [fastqq] File ( 1/1) '/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/Mars-2020/original/ERR5169592_2.fastq.gz'    done.
rcR1 <- read_content(fseqR1)
plot_read_content(rcR1) + labs(title = "Per base sequence content - Reverse read")

LOOK FOR PRIMERS

Now, we will look whether the reads still contain the primers. The methods section indicates that the V4 region of the 16S rRNA gene was amplified as described in the paper by Gohl et al.. Primer sequences are shared in that paper.

# V4 region primers (from Gohl et al.); note the IUPAC degenerate bases
FWD <- "GTGCCAGCMGCCGCGGTAA"  # F515 forward primer sequence
REV <- "GGACTACHVGGGTWTCTAAT" # R806 reverse primer sequence 

# Function that, from the primer sequence, will return all combinations possible (complement, reverse complement, ...)
# Function that, from the primer sequence, returns all possible orientations
# (Forward, Complement, Reverse, RevComp) as a named character vector.
allOrients <- function(primer) {
    # Biostrings is already attached at the top of the script, so we do not
    # re-load it here (require() inside a function is an anti-pattern).
    dna <- DNAString(primer)  # Biostrings works w/ DNAString objects rather than character vectors
    orients <- c(Forward = dna, Complement = complement(dna),
                 Reverse = reverse(dna), RevComp = reverseComplement(dna))
    # Convert back to plain strings; vapply guarantees a character vector
    # (names are preserved from `orients`)
    vapply(orients, toString, character(1))
}

# Get all orientations of the two primer sequences
FWD.orients <- allOrients(FWD) # F515
REV.orients <- allOrients(REV) # R806
FWD.orients # sanity check
##               Forward            Complement               Reverse 
## "GTGCCAGCMGCCGCGGTAA" "CACGGTCGKCGGCGCCATT" "AATGGCGCCGMCGACCGTG" 
##               RevComp 
## "TTACCGCGGCKGCTGGCAC"
REV.orients # sanity check
##                Forward             Complement                Reverse 
## "GGACTACHVGGGTWTCTAAT" "CCTGATGDBCCCAWAGATTA" "TAATCTWTGGGVHCATCAGG" 
##                RevComp 
## "ATTAGAWACCCBDGTAGTCC"
# Function that counts number of reads in which a sequence is found
# Count the number of reads in fastq file `fn` that contain `primer`
# (IUPAC-aware matching, up to 2 mismatches allowed)
primerHits <- function(primer, fn) {
    reads <- sread(readFastq(fn))
    hit.counts <- vcountPattern(primer, reads, fixed = FALSE, max.mismatch = 2)
    sum(hit.counts > 0)
}

# Get a table to know how many times the F515 and R806 primers are found in the reads of each sample
for (i in 1:3){
  # Count reads directly from the fastq file: the raw_stats data.frame may not
  # exist at this point (its computation earlier in the script is optional),
  # and referencing it here would error with "object 'raw_stats' not found".
  n.reads <- length(readFastq(fnFs[[i]]))
  cat("SAMPLE", sample.names[i], "with total number of", n.reads, "reads\n\n")
  # Rows: read direction x primer; columns: primer orientation
  x <- rbind(ForwardRead.FWDPrimer = sapply(FWD.orients, primerHits, fn = fnFs[[i]]),
             ForwardRead.REVPrimer = sapply(REV.orients, primerHits, fn = fnFs[[i]]),
             ReverseRead.FWDPrimer = sapply(FWD.orients, primerHits, fn = fnRs[[i]]), 
             ReverseRead.REVPrimer = sapply(REV.orients, primerHits, fn = fnRs[[i]]))
  print(x)
  cat("\n____________________________________________\n\n")
}
## SAMPLE ERR5169583 with total number of 105593 reads
## 
##                       Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer  102879          0       0       4
## ForwardRead.REVPrimer       4          0       0   96739
## ReverseRead.FWDPrimer       4          0       0   83295
## ReverseRead.REVPrimer  103571          0       0       3
## 
## ____________________________________________
## 
## SAMPLE ERR5169584 with total number of 129697 reads
## 
##                       Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer  125832          0       0      13
## ForwardRead.REVPrimer      13          0       0  118601
## ReverseRead.FWDPrimer      13          0       0  103070
## ReverseRead.REVPrimer  127174          0       0      13
## 
## ____________________________________________
## 
## SAMPLE ERR5169585 with total number of 70485 reads
## 
##                       Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer   68860          0       0       1
## ForwardRead.REVPrimer       1          0       0   64299
## ReverseRead.FWDPrimer       1          0       0   55897
## ReverseRead.REVPrimer   69315          0       0       1
## 
## ____________________________________________

FILTER AND TRIM

1. Primer removal

Almost all reads contain the forward & reverse primers. Ideally, to follow the standardized pipeline, we should (1) filter out reads not containing any primer and (2) trim the primers. However, the fastq files deposited on the SRA/ENA databases do not contain Illumina headers. Thus, during the merging step of paired reads later on, the Dada2 package won’t be able to match paired-reads.

We will simply perform a quality filtering of the reads, and trim the first 20 and last 30 bases of the reads (containing the primers).

2. Quality filtering

Then, we perform a quality filtering of the reads.

# Place filtered files in a filtered2/ subdirectory
FWD.filt2_samples <- file.path(path, "filtered2", paste0(sample.names, "_F_filt2.fastq.gz")) # FWD reads
REV.filt2_samples <- file.path(path, "filtered2", paste0(sample.names, "_R_filt2.fastq.gz")) # REV reads
# Assign names for the filtered fastq.gz files (keys = sample names)
names(FWD.filt2_samples) <- sample.names
names(REV.filt2_samples) <- sample.names

# Quality-filter the reads; primers are removed by fixed-length trimming
# (20 nt at the 5' end, 30 nt at the 3' end) since cutadapt-style primer
# matching is not possible here
out2 <- filterAndTrim(fwd = fnFs, filt = FWD.filt2_samples,
                      rev = fnRs, filt.rev = REV.filt2_samples,
                      trimLeft = 20, trimRight=30, # trim primers
                      maxEE=3, # reads with more than 3 expected errors (sum(10^(-Q/10))) are discarded
                      truncQ=10, # Truncate reads at the first instance of a quality score less than or equal to truncQ.
                      minLen=150, # Discard reads shorter than 150 bp.
                      compress=TRUE,
                      multithread = TRUE,
                      verbose=TRUE)

Let’s look at the output filtered fastq files as sanity check.

out2[1:6,] # show how many reads were filtered in each file (reads.in vs reads.out)
##                       reads.in reads.out
## ERR5169583_1.fastq.gz   105593     67882
## ERR5169584_1.fastq.gz   129697     84891
## ERR5169585_1.fastq.gz    70485     46101
## ERR5169586_1.fastq.gz    86720     56496
## ERR5169587_1.fastq.gz    88688     58512
## ERR5169588_1.fastq.gz    44429     21535
# Look at quality profile of the filtered files (first 3 samples only)
for (i in 1:3){
  show(plotQualityProfile(FWD.filt2_samples[i]))
  show(plotQualityProfile(REV.filt2_samples[i]))
}

LEARN ERROR RATES

Now we will build the parametric error model, to be able to infer amplicon sequence variants (ASVs) later on.

# Learn the error model from the filtered reads (samples picked in random order)
errF <- learnErrors(FWD.filt2_samples, multithread=TRUE, randomize=TRUE, verbose = 1) # forward
errR <- learnErrors(REV.filt2_samples, multithread=TRUE, randomize=TRUE, verbose = 1) # reverse

The error rates for each possible transition (A→C, A→G, …) are shown. Points are the observed error rates for each consensus quality score. The black line shows the estimated error rates after convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal definition of the Q-score. Here the estimated error rates (black line) are a good fit to the observed rates (points), and the error rates drop with increased quality as expected.

# Observed (points) vs estimated (black line) error rates;
# red line = expectation under the nominal Q-score definition
plotErrors(errF, nominalQ = TRUE) # Forward reads

plotErrors(errR, nominalQ = TRUE) # Reverse reads

CONSTRUCT SEQUENCE TABLE

1. Infer sample composition

The dada() algorithm infers sequence variants based on estimated errors (previous step). Firstly, we de-replicate the reads in each sample, to reduce the computation time. De-replication is a common step in almost all modern ASV inference (or OTU picking) pipelines, but a unique feature of derepFastq is that it maintains a summary of the quality information for each dereplicated sequence in $quals.

# Dereplicate the reads in each sample (collapses identical reads; per-position
# quality summaries are kept in $quals)
derepF <- derepFastq(FWD.filt2_samples) # forward
derepR <- derepFastq(REV.filt2_samples) # reverse

# Infer sequence variants (ASVs) using the error models learned above
dadaFs <- dada(derepF, err=errF, multithread=TRUE) # forward
dadaRs <- dada(derepR, err=errR, multithread=TRUE) # reverse
# Inspect the inferred sequence variants (first 3 samples only)
for (i in 1:3){
  print(dadaFs[[i]])
  print(dadaRs[[i]])
  print("________________")
}
## dada-class: object describing DADA2 denoising results
## 217 sequence variants were inferred from 13852 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 134 sequence variants were inferred from 16309 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 226 sequence variants were inferred from 16473 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 125 sequence variants were inferred from 19321 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 188 sequence variants were inferred from 9899 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 111 sequence variants were inferred from 11937 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"

2. Merge paired-end reads

We now need to merge paired reads.

# Merge denoised forward and reverse reads into full amplicon sequences
mergers <- mergePairs(dadaFs, derepF, dadaRs, derepR, verbose=TRUE)
head(mergers[[1]]) # inspect merge results for the first sample

3. Construct ASV table

We can now construct an amplicon sequence variant table (ASV) table, a higher-resolution version of the OTU table produced by traditional methods.

# Make sequence table from the inferred sequence variants
seqtable <- makeSequenceTable(mergers)

# We should have 72 samples (72 rows)
dim(seqtable)
## [1]   72 3733
# Inspect distribution of sequence lengths
hist(nchar(getSequences(seqtable)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")

# Sequences should be between 515F - 806R, so around ~250bp (removing the primers lengths).
# Remove any sequence variant outside the 200-300bp range
seqtable.new <- seqtable[,nchar(colnames(seqtable)) %in% 200:300]
dim(seqtable.new)
## [1]   72 1738
hist(nchar(getSequences(seqtable.new)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")

4. Remove chimeras

The core dada method corrects substitution and indel errors, but chimeras remain. Fortunately, the accuracy of sequence variants after denoising makes identifying chimeric ASVs simpler than when dealing with fuzzy OTUs. Chimeric sequences are identified if they can be exactly reconstructed by combining a left-segment and a right-segment from two more abundant “parent” sequences.

# Identify and remove chimeric ASVs by consensus vote across samples
seqtable.nochim <- removeBimeraDenovo(seqtable.new, method="consensus", multithread=TRUE, verbose=TRUE)

# Check how many sequence variants we have after removing chimeras
dim(seqtable.nochim)
## [1]   72 1678
# Check how many reads we have after removing chimeras (we should keep the vast majority of the reads, like > 80%)
sum(seqtable.nochim)/sum(seqtable.new)
## [1] 0.9963381

LOOK AT READS COUNT THROUGH PIPELINE

Sanity check before assigning taxonomy.

# Function that counts nb of reads in a dada2 object (dada, derep, or merger)
getN <- function(x) sum(getUniques(x))

# Table that tracks the number of reads surviving each step of the pipeline:
# input reads, quality-filtered reads, denoised reads (F/R), merged reads,
# non-chimeric reads, and the % of input reads kept at the end.
# NOTE: as.integer() is vectorized and is applied directly to the percentage
# vector; the original lapply() call produced a list, which made cbind()
# build a list-matrix and broke numeric use of `track` downstream.
track <- cbind(out2,
               sapply(dadaFs, getN),
               sapply(dadaRs, getN),
               sapply(mergers, getN),
               rowSums(seqtable.nochim),
               as.integer(rowSums(seqtable.nochim) * 100 / out2[, 1]))

# Assign column and row names
colnames(track) <- c("input", "quality-filt", "denoisedF", "denoisedR", 'merged', 'nonchim', "%input->output")
rownames(track) <- sample.names

# Show final table: for each row/sample, we have shown the initial number of reads, filtered reads, denoised reads, and non chimera reads
track
##            input  quality-filt denoisedF denoisedR merged nonchim
## ERR5169583 105593 67882        67514     67468     64359  46170  
## ERR5169584 129697 84891        84386     84242     81354  63684  
## ERR5169585 70485  46101        45855     45742     44299  36610  
## ERR5169586 86720  56496        56148     56038     53806  45822  
## ERR5169587 88688  58512        58086     58008     55905  38849  
## ERR5169588 44429  21535        21249     21253     20536  20392  
## ERR5169589 97536  61839        61359     61297     60215  27380  
## ERR5169590 113298 73704        73246     73047     69120  46633  
## ERR5169591 70662  45311        44986     44815     41039  39904  
## ERR5169592 84848  53043        52654     52515     50040  35368  
## ERR5169593 70070  45741        45278     45200     43624  33508  
## ERR5169594 70514  46313        46066     45968     44669  33429  
## ERR5169595 81234  51884        51578     51518     49541  45070  
## ERR5169596 62654  39902        39785     39663     37516  35740  
## ERR5169597 123998 82637        82288     82277     81354  47033  
## ERR5169598 105876 71672        71412     71414     70545  26973  
## ERR5169599 105644 72969        72674     72648     72158  6350   
## ERR5169600 79270  50487        50177     50177     47739  46312  
## ERR5169601 76041  47762        47346     47320     45669  37142  
## ERR5169602 87122  59028        58789     58775     58113  4073   
## ERR5169603 73742  48317        48114     48084     47010  37223  
## ERR5169604 74329  46552        46278     46151     44192  40664  
## ERR5169605 55292  36072        35901     35734     34195  29936  
## ERR5169606 101878 66803        66575     66609     64616  61512  
## ERR5169607 83029  52595        52324     52211     50107  43446  
## ERR5169608 82800  56168        55822     55799     54485  18688  
## ERR5169609 73407  48392        48111     48109     45666  42662  
## ERR5169610 69347  47775        47518     47440     46274  20812  
## ERR5169611 70806  45827        45462     45284     43149  36655  
## ERR5169612 77124  52468        52161     52163     51271  19178  
## ERR5169613 68381  43427        43265     43233     41903  38519  
## ERR5169614 74287  46555        46216     46183     44623  34923  
## ERR5169615 64188  41889        41656     41529     39654  34861  
## ERR5169616 63924  40179        40011     39896     37615  30415  
## ERR5169617 91806  63077        62795     62710     61169  19465  
## ERR5169618 89209  60319        60007     60032     57906  27654  
## ERR5169619 1633   701          680       633       562    562    
## ERR5169620 72249  44767        44512     44430     42735  26892  
## ERR5169621 86114  55549        55331     55240     52425  35200  
## ERR5169622 52406  34619        34458     34469     32990  28419  
## ERR5169623 97958  61968        61563     61422     59478  41553  
## ERR5169624 65330  41531        41278     41111     38971  35218  
## ERR5169625 55228  34342        34109     34007     32495  26377  
## ERR5169626 75144  45993        45734     45536     42554  39393  
## ERR5169627 65796  41340        41062     40964     38875  33949  
## ERR5169628 67956  41265        40873     40627     37816  31093  
## ERR5169629 72891  47614        47288     47245     45526  34920  
## ERR5169630 83538  54506        54174     54242     51372  46832  
## ERR5169631 67169  42595        42353     42336     41079  33441  
## ERR5169632 53835  34606        34419     34322     32417  29662  
## ERR5169633 52155  34003        33830     33732     32101  24020  
## ERR5169634 76435  48921        48598     48472     45800  42907  
## ERR5169635 53442  34376        34049     33930     31764  27063  
## ERR5169636 41211  26655        26514     26453     25099  22439  
## ERR5169637 46747  29274        29102     29119     27417  26349  
## ERR5169638 76795  46229        45999     45990     45442  32566  
## ERR5169639 71235  42595        42413     42382     40974  30186  
## ERR5169640 39718  23558        23388     23360     22604  15252  
## ERR5169641 68115  44439        44340     44329     43235  36168  
## ERR5169642 17042  11307        11165     11140     11037  421    
## ERR5169643 26208  16351        16162     16060     15450  12128  
## ERR5169644 64295  40761        40620     40574     39237  32567  
## ERR5169645 50906  30554        30265     30319     28395  26992  
## ERR5169646 73257  47766        47571     47625     46251  31675  
## ERR5169647 8075   5306         5216      5196      5124   247    
## ERR5169648 7421   4659         4464      4441      4124   1955   
## ERR5169649 47727  31004        30826     30859     28775  27661  
## ERR5169650 75469  49539        49283     49101     46863  43825  
## ERR5169651 17381  10780        10699     10526     9696   5338   
## ERR5169652 14220  8886         8816      8677      8159   7661   
## ERR5169653 49556  32426        32132     32058     30825  26002  
## ERR5169654 83283  56457        55932     55870     54652  22463  
##            %input->output
## ERR5169583 43            
## ERR5169584 49            
## ERR5169585 51            
## ERR5169586 52            
## ERR5169587 43            
## ERR5169588 45            
## ERR5169589 28            
## ERR5169590 41            
## ERR5169591 56            
## ERR5169592 41            
## ERR5169593 47            
## ERR5169594 47            
## ERR5169595 55            
## ERR5169596 57            
## ERR5169597 37            
## ERR5169598 25            
## ERR5169599 6             
## ERR5169600 58            
## ERR5169601 48            
## ERR5169602 4             
## ERR5169603 50            
## ERR5169604 54            
## ERR5169605 54            
## ERR5169606 60            
## ERR5169607 52            
## ERR5169608 22            
## ERR5169609 58            
## ERR5169610 30            
## ERR5169611 51            
## ERR5169612 24            
## ERR5169613 56            
## ERR5169614 47            
## ERR5169615 54            
## ERR5169616 47            
## ERR5169617 21            
## ERR5169618 30            
## ERR5169619 34            
## ERR5169620 37            
## ERR5169621 40            
## ERR5169622 54            
## ERR5169623 42            
## ERR5169624 53            
## ERR5169625 47            
## ERR5169626 52            
## ERR5169627 51            
## ERR5169628 45            
## ERR5169629 47            
## ERR5169630 56            
## ERR5169631 49            
## ERR5169632 55            
## ERR5169633 46            
## ERR5169634 56            
## ERR5169635 50            
## ERR5169636 54            
## ERR5169637 56            
## ERR5169638 42            
## ERR5169639 42            
## ERR5169640 38            
## ERR5169641 53            
## ERR5169642 2             
## ERR5169643 46            
## ERR5169644 50            
## ERR5169645 53            
## ERR5169646 43            
## ERR5169647 3             
## ERR5169648 26            
## ERR5169649 57            
## ERR5169650 58            
## ERR5169651 30            
## ERR5169652 53            
## ERR5169653 52            
## ERR5169654 26

ASSIGN TAXONOMY

Extensions: The dada2 package also implements a method to make species level assignments based on exact matching between ASVs and sequenced reference strains. Recent analysis suggests that exact matching (or 100% identity) is the only appropriate way to assign species to 16S gene fragments. Currently, species-assignment training fastas are available for the Silva and RDP 16S databases. To follow the optional species addition step, download the silva_species_assignment_v138.fa.gz file, and place it in the directory with the fastq files.

# Assign taxonomy (with silva v138)
taxa <- assignTaxonomy(seqtable.nochim, "~/Projects/IBS_Meta-analysis_16S/data/silva_nr_v138_train_set.fa.gz",
                       tryRC = TRUE, # try reverse complement of the sequences
                       multithread=TRUE, verbose = TRUE)

# Add species assignment (exact matching against the reference fasta)
taxa <- addSpecies(taxa, "~/Projects/IBS_Meta-analysis_16S/data/silva_species_assignment_v138.fa.gz")
# Check how the taxonomy table looks like
taxa.print <- taxa 
rownames(taxa.print) <- NULL # Removing sequence rownames for display only
head(taxa.print)
##      Kingdom    Phylum         Class         Order            
## [1,] "Bacteria" "Firmicutes"   "Clostridia"  "Lachnospirales" 
## [2,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales"  
## [3,] "Bacteria" "Firmicutes"   "Clostridia"  "Lachnospirales" 
## [4,] "Bacteria" "Firmicutes"   "Clostridia"  "Oscillospirales"
## [5,] "Bacteria" "Firmicutes"   "Clostridia"  "Oscillospirales"
## [6,] "Bacteria" "Firmicutes"   "Clostridia"  "Lachnospirales" 
##      Family            Genus              Species         
## [1,] "Lachnospiraceae" "Blautia"          NA              
## [2,] "Bacteroidaceae"  "Bacteroides"      "vulgatus"      
## [3,] "Lachnospiraceae" "Agathobacter"     NA              
## [4,] "Ruminococcaceae" "Faecalibacterium" NA              
## [5,] "Ruminococcaceae" "Faecalibacterium" "prausnitzii"   
## [6,] "Lachnospiraceae" "Fusicatenibacter" "saccharivorans"
table(taxa.print[,1]) # Show the different kingdoms (should be mostly Bacteria)
## 
##   Archaea  Bacteria Eukaryota 
##         4      1560        74
taxa.print[taxa.print[,1] == 'Eukaryota',2] # the Eukaryota ASVs all have NA phyla
##   [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [26] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [51] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [76] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [101] NA NA NA NA NA NA NA NA NA NA NA NA NA NA
table(taxa.print[,2]) # Show the different phyla and their ASV counts
## 
##  Abditibacteriota   Acidobacteriota  Actinobacteriota      Bacteroidota 
##                 1                 3                81               202 
##  Campilobacterota     Cyanobacteria  Desulfobacterota     Euryarchaeota 
##                 7                19                13                 2 
##    Fibrobacterota        Firmicutes    Fusobacteriota   Patescibacteria 
##                 1              1072                22                12 
##    Proteobacteria     Spirochaetota      Synergistota  Thermoplasmatota 
##               108                 4                 3                 2 
##      Thermotogota Verrucomicrobiota 
##                 1                 8
table(is.na(taxa.print[,2])) # is there any NA phyla? (these get removed below)
## 
## FALSE  TRUE 
##  1561   117

LAST PREPROCESSING

We will remove any sample with less than 500 reads from further analysis, and also any ASVs with unassigned phyla.

1. Create phyloseq object

The preprocessing will be easier to do with ASV, taxonomic and metadata tables combined in a phyloseq object.

#_________________________
# Import metadata
# NOTE(review): assumes the CSV has a redundant index column named "X" — confirm
metadata_table <- read.csv(file.path(path, "00_Metadata-Mars/modif_metadata(R).csv")) %>% select(-X)
colnames(metadata_table)
# Row names must be the run accessions so they match the ASV table rows
rownames(metadata_table) <- metadata_table$Run

# Add a few study-level covariates (constant here, useful for the meta-analysis)
metadata_table$author <- "Mars"
metadata_table$sequencing_tech <- "Illumina"
metadata_table$variable_region <- "V4"

# Strip everything after the first "_" (e.g. _F_filt2.fastq.gz) from rownames
# rownames(seqtable.nochim)[1:5]
rownames(seqtable.nochim) <- sub("_.*", "", rownames(seqtable.nochim))

#_________________________
# Create phyloseq object combining ASV counts, sample metadata, and taxonomy
physeq <- phyloseq(otu_table(seqtable.nochim, taxa_are_rows=FALSE), # by default, in otu_table the sequence variants are in rows
                  sample_data(metadata_table), 
                  tax_table(taxa))

# Remove taxa that are eukaryota, or have unassigned Phyla
physeq <- subset_taxa(physeq, Kingdom != "Eukaryota")
physeq <- subset_taxa(physeq, !is.na(Phylum))
# Remove samples with less than 500 reads
physeq <- prune_samples(sample_sums(physeq)>=500, physeq)

2. Save to disk

# Save intermediate objects to disk so the pipeline can be resumed later.
# raw_stats is only defined if the (optional) read-count block near the top of
# the script was run — it is commented out by default — so guard the save to
# avoid "object 'raw_stats' not found".
if (exists("raw_stats")) {
  saveRDS(raw_stats, file.path(path, "01_Dada2-Mars/raw_stats.rds"))
}
saveRDS(out2, file.path(path, "01_Dada2-Mars/out2.rds"))
saveRDS(errF, file.path(path, "01_Dada2-Mars/errF.rds"))
saveRDS(errR, file.path(path, "01_Dada2-Mars/errR.rds"))
saveRDS(dadaFs, file.path(path, "01_Dada2-Mars/infered_seq_F.rds"))
saveRDS(dadaRs, file.path(path, "01_Dada2-Mars/infered_seq_R.rds"))
saveRDS(mergers, file.path(path, "01_Dada2-Mars/mergers.rds"))

# Final tables extracted from the phyloseq object
saveRDS(otu_table(physeq), file.path(path, "01_Dada2-Mars/ASVtable_final.rds")) # ASV table
saveRDS(tax_table(physeq), file.path(path, "01_Dada2-Mars/taxa_final.rds")) # taxa table
saveRDS(sample_data(physeq), file.path(path, "01_Dada2-Mars/metadata_final.rds")) # metadata table

# Taxa & Phyloseq object (shared locations consumed by downstream cluster jobs)
saveRDS(taxa, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/CLUSTER/Taxonomy/output/taxa_mars.rds")
saveRDS(physeq, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/CLUSTER/PhyloTree/input/physeq_mars.rds")

3. Quick peek at data analysis

# Absolute abundance barplot, faceted by disease status
plot_bar(physeq, fill = "Phylum")+ facet_wrap("host_disease", scales="free_x") + theme(axis.text.x = element_blank())

# Relative abundance for Phylum
phylum.table <- physeq %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format

# Stacked barplot of phylum-level relative abundances, one bar per sample
ggplot(phylum.table, aes(x = Sample, y = Abundance, fill = Phylum))+
  facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(size = 5, angle = -90))+
  labs(x = "Samples", y = "Relative abundance")